import pandas as pd
import numpy as np
from sklearn import datasets
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from mpl_toolkits.mplot3d import Axes3D
from sklearn.manifold import TSNE
from sklearn.manifold import MDS
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestCentroid
import pickle
import math
# additional files
import lion_tsne
import input_data
# Three-colour seaborn palette and a matching matplotlib colormap used by the iris scatter plots.
PALETTE = sns.color_palette('deep', n_colors=3)
CMAP = ListedColormap(PALETTE.as_hex())
# Seed handed to the scikit-learn TSNE runs (and to NumPy before the MNIST sampling) in this file.
RANDOM_STATE = 42
# Load the iris dataset and wrap its feature matrix in a DataFrame with named columns.
data_iris = datasets.load_iris()
X_iris = data_iris.data
features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
iris = pd.DataFrame(X_iris, columns=features)
# Class labels taken from the dataset's target array.
iris['species'] = data_iris.target
# Bare expression: only produces visible output in an interactive/notebook session.
iris.head()
def plot_iris_2d(x, y, title, xlabel="1st eigenvector", ylabel="2nd eigenvector",colors=iris['species']):
    """Draw a 2-D scatter plot on the current matplotlib figure.

    Args:
        x: Horizontal coordinates of the points.
        y: Vertical coordinates of the points.
        title: Text shown above the plot.
        xlabel: Label for the x axis.
        ylabel: Label for the y axis.
        colors: Per-point values mapped through ``CMAP``. The default is the
            ``species`` column of the module-level ``iris`` frame, bound once
            when this function is defined.
    """
    figure = plt.gcf()
    figure.set_size_inches(10, 10)
    sns.set_style("darkgrid")
    scatter_options = {"c": colors, "cmap": CMAP, "s": 70}
    plt.scatter(x, y, **scatter_options)
    plt.title(title, fontsize=20, y=1.03)
    # Both axis labels share the same font size.
    for set_label, text in ((plt.xlabel, xlabel), (plt.ylabel, ylabel)):
        set_label(text, fontsize=16)
def plot_iris_3d(x, y, z, title):
    """Draw a 3-D scatter plot of the iris samples, coloured by species.

    Args:
        x: Coordinates along the first axis.
        y: Coordinates along the second axis.
        z: Coordinates along the third axis.
        title: Text shown above the plot.

    Point colours always come from the module-level ``iris['species']``
    column mapped through ``CMAP``, so the inputs must be in the same row
    order as ``iris``.
    """
    sns.set_style('whitegrid')
    fig = plt.figure(1, figsize=(8, 6))
    # Create the axes through the figure so they are registered with it;
    # recent Matplotlib no longer attaches a directly constructed
    # Axes3D(fig) to the figure, which leaves the plot empty.
    ax = fig.add_subplot(111, projection='3d', elev=-150, azim=110)
    ax.scatter(x, y, z,
               c=iris['species'],
               cmap=CMAP,
               s=40)
    ax.set_title(title, fontsize=20, y=1.03)

    fsize = 14
    ax.set_xlabel("1st eigenvector", fontsize=fsize)
    ax.set_ylabel("2nd eigenvector", fontsize=fsize)
    ax.set_zlabel("3rd eigenvector", fontsize=fsize)

    # Hide the tick labels. The w_xaxis/w_yaxis/w_zaxis aliases are
    # deprecated; xaxis/yaxis/zaxis are the supported attributes.
    ax.xaxis.set_ticklabels([])
    ax.yaxis.set_ticklabels([])
    ax.zaxis.set_ticklabels([])
# Scatter the two raw sepal features of the full iris frame, coloured by species (the function default).
plot_iris_2d(
x = iris['sepal_length'],
y = iris['sepal_width'],
title = 'Plotting first two components',
xlabel = 'Sepal length',
ylabel = 'Sepal width')
# Random sampling: draw 120 row indices from the iris data with a fixed NumPy seed.
# NOTE(review): np.random.choice samples with replacement by default, so rows may repeat — confirm this is intended.
np.random.seed(0)
ind = np.random.choice(np.arange(data_iris.data.shape[0]), size = 120)
X_iris_random = data_iris.data[ind]
y_iris_random = data_iris.target[ind]
# Sampled features plus their integer class labels in a 'target' column.
iris_random_df = pd.DataFrame(X_iris_random, columns=features)
iris_random_df['target'] = y_iris_random
# Bare expression: class balance of the sample, visible only in an interactive session.
iris_random_df['target'].value_counts()
def euclidean_distance(point1, point2):
    """Return the Euclidean distance between two equal-length vectors.

    Every coordinate contributes to the distance. The previous loop stopped
    at ``len(point1) - 1`` and silently ignored the last coordinate, although
    the callers in this file pass pure feature vectors with no trailing
    label column.

    Args:
        point1: Sequence of numbers.
        point2: Sequence of numbers with the same length as ``point1``.

    Returns:
        The distance as a float (``0.0`` for two empty vectors).

    Raises:
        ValueError: If the two vectors differ in length.
    """
    if len(point1) != len(point2):
        raise ValueError("point1 and point2 must have the same length")
    squared = 0.0
    for a, b in zip(point1, point2):
        squared += (a - b) ** 2
    return math.sqrt(squared)
def get_neighbors(train, test_row, num_neighbors, class_type):
    """Return the rows of ``train`` closest to ``test_row`` as a labelled frame.

    Args:
        train: Iterable of feature rows, each with one value per entry of the
            module-level ``features`` list.
        test_row: Reference vector the distances are measured from.
        num_neighbors: Maximum number of nearest rows to keep.
        class_type: Value written to the ``target`` column of every row.

    Returns:
        A DataFrame with the ``features`` columns plus ``target``, ordered
        from nearest to farthest. If ``train`` has fewer rows than
        ``num_neighbors``, all of them are returned.
    """
    # sorted() is stable, so ties keep their original order in ``train``.
    ranked = sorted(train, key=lambda row: euclidean_distance(test_row, row))
    # Build the frame in one step: DataFrame.append was removed in pandas 2.0
    # and re-copied the whole frame on every call.
    result_df = pd.DataFrame(list(ranked[:num_neighbors]), columns=features)
    result_df['target'] = class_type
    return result_df
# kNN sampling: fit one centroid per iris class, then keep the 40 rows
# closest to each centroid.
clf = NearestCentroid()
clf.fit(data_iris.data, data_iris.target)
# Bare expression: shows the centroids in an interactive session.
clf.centroids_
iris_target_values = np.unique(data_iris.target)
# Class values are used directly as row indices into clf.centroids_.
# The per-class frames are combined with pd.concat because
# DataFrame.append was removed in pandas 2.0.
iris_knn_df = pd.concat(
    [get_neighbors(data_iris.data, clf.centroids_[i], 40, str(i))
     for i in iris_target_values],
    ignore_index=True)
iris_knn_df['target'].value_counts()
tSNE
# Embed the randomly sampled iris rows into 2-D with scikit-learn's TSNE and plot them coloured by target.
tsne = TSNE(n_components=2, n_iter=3000, random_state=RANDOM_STATE)
layers = tsne.fit_transform(iris_random_df.loc[:, features].values)
plot_iris_2d(
x = layers[:, 0],
y = layers[:, 1],
title = 'IRIS random sampling with t-SNE',
colors=iris_random_df.loc[:, ['target']].values)
# Same embedding and plot for the centroid-neighbour (kNN) sample; `tsne` and `layers` are rebound.
tsne = TSNE(n_components=2, n_iter=3000, random_state=RANDOM_STATE)
layers = tsne.fit_transform(iris_knn_df.loc[:, features].values)
plot_iris_2d(
x = layers[:, 0],
y = layers[:, 1],
title = 'IRIS kNN sampling with t-SNE',
colors=iris_knn_df.loc[:, ['target']].values)
%%capture
# Notebook cell magic above; it must stay the first line of its cell.
# Fit LION t-SNE on the randomly sampled iris features and plot the first two columns of the result.
lionTSNE_iris = lion_tsne.LionTSNE(perplexity=30)
layers = lionTSNE_iris.fit(iris_random_df.loc[:, features].values, optimizer_kwargs={'momentum': 0.8, 'n_iter': 3000,
'early_exaggeration_iters' : 300}, random_seed=1, verbose=2)
plot_iris_2d(
x = layers[:, 0],
y = layers[:, 1],
title = 'IRIS random sampling with LION t-SNE',
colors=iris_random_df.loc[:, ['target']].values)
%%capture
# Same LION t-SNE settings applied to the kNN-sampled iris frame.
lionTSNE_iris = lion_tsne.LionTSNE(perplexity=30)
layers = lionTSNE_iris.fit(iris_knn_df.loc[:, features].values, optimizer_kwargs={'momentum': 0.8, 'n_iter': 3000,
'early_exaggeration_iters' : 300}, random_seed=1, verbose=2)
plot_iris_2d(
x = layers[:, 0],
y = layers[:, 1],
title = 'IRIS kNN sampling with LION t-SNE',
colors=iris_knn_df.loc[:, ['target']].values)
%%capture
# PCA-transform the randomly sampled iris features, then run LION t-SNE on the PCA output.
# n_components=4 equals the number of feature columns, so PCA keeps every component here.
iris_pca = PCA(n_components=4)
X_iris_pca = iris_pca.fit_transform(iris_random_df.loc[:, features].values)
lionTSNE_iris = lion_tsne.LionTSNE(perplexity=30)
layers = lionTSNE_iris.fit(X_iris_pca, optimizer_kwargs={'momentum': 0.8, 'n_iter': 3000,
'early_exaggeration_iters' : 300}, random_seed=1, verbose=2)
plot_iris_2d(
x = layers[:, 0],
y = layers[:, 1],
title = 'IRIS random sampling with LION t-SNE and PCA',
colors=iris_random_df.loc[:, ['target']].values)
%%capture
# Same PCA + LION t-SNE pipeline on the kNN-sampled iris frame.
iris_pca = PCA(n_components=4)
X_iris_pca = iris_pca.fit_transform(iris_knn_df.loc[:, features].values)
lionTSNE_iris = lion_tsne.LionTSNE(perplexity=30)
layers = lionTSNE_iris.fit(X_iris_pca, optimizer_kwargs={'momentum': 0.8, 'n_iter': 3000,
'early_exaggeration_iters' : 300}, random_seed=1, verbose=2)
plot_iris_2d(
x = layers[:, 0],
y = layers[:, 1],
title = 'IRIS kNN sampling with LION t-SNE and PCA',
colors=iris_knn_df.loc[:, ['target']].values)
%%capture
# MDS-transform the randomly sampled iris features (4 output components), then run LION t-SNE on the result.
# NOTE(review): MDS is constructed without a random_state, so this step may not be reproducible — confirm.
iris_mds = MDS(n_components=4)
X_iris_mds = iris_mds.fit_transform(iris_random_df.loc[:, features].values)
lionTSNE_iris = lion_tsne.LionTSNE(perplexity=30)
layers = lionTSNE_iris.fit(X_iris_mds, optimizer_kwargs={'momentum': 0.8, 'n_iter': 3000,
'early_exaggeration_iters' : 300}, random_seed=1, verbose=2)
plot_iris_2d(
x = layers[:, 0],
y = layers[:, 1],
title = 'IRIS random sampling with LION t-SNE and MDS',
colors=iris_random_df.loc[:, ['target']].values)
%%capture
iris_mds = MDS(n_components=4)
X_iris_mds = iris_mds.fit_transform(iris_knn_df.loc[:, features].values)
lionTSNE_iris = lion_tsne.LionTSNE(perplexity=30)
layers = lionTSNE_iris.fit(X_iris_mds, optimizer_kwargs={'momentum': 0.8, 'n_iter': 3000,
'early_exaggeration_iters' : 300}, random_seed=1, verbose=2)
plot_iris_2d(
x = layers[:, 0],
y = layers[:, 1],
title = 'IRIS random sampling with LION t-SNE and MDS',
colors=iris_knn_df.loc[:, ['target']].values)
def plot_mnist_2d(df, title, xlabel="1st eigenvector", ylabel="2nd eigenvector"):
    """Scatter a 2-D embedding on the current figure, one series per digit class.

    Args:
        df: DataFrame with ``'1st eigenvector'``, ``'2nd eigenvector'`` and
            ``'target'`` columns. ``target`` may hold integers or their
            string form.
        title: Text shown above the plot.
        xlabel: Label for the x axis.
        ylabel: Label for the y axis.

    The class list comes from the module-level ``data_digits.target_names``.
    """
    plt.gcf().set_size_inches(20, 15)
    sns.set_style("darkgrid")
    # Compare labels as strings. The random-sample frames store integer
    # targets, which never equal str(d) and previously left the plot empty;
    # the kNN frames already store strings.
    targets = df['target'].astype(str)
    legend_list = []
    for d in data_digits.target_names:
        label = str(d)
        points = df[targets == label]
        plt.scatter(points['1st eigenvector'], points['2nd eigenvector'])
        legend_list.append(label)
    plt.title(title, fontsize=20, y=1.03)
    plt.legend(legend_list)
    plt.xlabel(xlabel, fontsize=16)
    plt.ylabel(ylabel, fontsize=16)
%%capture
# data_digits is only used for its target_names (the class list iterated by the MNIST/FMNIST plot helpers).
data_digits = datasets.load_digits()
# Load MNIST through the local input_data helper with one-hot encoded labels.
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
all_mnist_trained_images = mnist.train.images
all_mnist_labels = mnist.train.labels
# Random sampling: draw 2000 training indices with a fixed seed.
# NOTE(review): np.random.choice samples with replacement by default, so images may repeat — confirm this is intended.
np.random.seed(RANDOM_STATE)
ind = np.random.choice(np.arange(len(mnist.train.images)), size = 2000)
mnist_chosen_indices = ind
X_mnist_rand = mnist.train.images[ind]
y_mnist_raw = mnist.train.labels[ind]
# Decode each one-hot label row into the index of its first 1.
y_mnist_rand = [np.where(r==1)[0][0] for r in y_mnist_raw]
mnist_random_df = pd.DataFrame(X_mnist_rand)
mnist_random_df['target'] = y_mnist_rand
# Bare expression: class balance of the sample, visible only in an interactive session.
mnist_random_df['target'].value_counts()
kNN sampling
def get_mnist_neighbors(train, test_row, num_neighbors, class_type):
    """Return the rows of ``train`` closest to ``test_row`` as a labelled frame.

    Args:
        train: Iterable of equal-length feature rows.
        test_row: Reference vector the distances are measured from.
        num_neighbors: Maximum number of nearest rows to keep.
        class_type: Value written to the ``target`` column of every row.

    Returns:
        A DataFrame with one integer-named column per feature plus
        ``target``, ordered from nearest to farthest. If ``train`` has fewer
        rows than ``num_neighbors``, all of them are returned.
    """
    # sorted() is stable, so ties keep their original order in ``train``.
    ranked = sorted(train, key=lambda row: euclidean_distance(test_row, row))
    # Build the frame in one step: DataFrame.append was removed in pandas 2.0
    # and re-copied the whole frame on every call.
    result_df = pd.DataFrame(list(ranked[:num_neighbors]))
    result_df['target'] = class_type
    return result_df
# kNN sampling: fit one centroid per digit on the first 12000 training
# images, then keep the 200 images closest to each centroid.
clf = NearestCentroid()
# Decode each one-hot label row into the index of its first 1.
mnist_train_labels = [np.where(r == 1)[0][0] for r in mnist.train.labels[:12000]]
clf.fit(mnist.train.images[:12000], mnist_train_labels)
# Bare expression: shows the centroids in an interactive session.
clf.centroids_
mnist_target_values = np.unique(mnist_train_labels)
# Class values are used directly as row indices into clf.centroids_.
# The per-class frames are combined with pd.concat because
# DataFrame.append was removed in pandas 2.0.
mnist_knn_df = pd.concat(
    [get_mnist_neighbors(mnist.train.images[:12000], clf.centroids_[i], 200, str(i))
     for i in mnist_target_values],
    ignore_index=True)
mnist_knn_df['target'].value_counts()
# scikit-learn TSNE on the randomly sampled MNIST pixels (every column except 'target').
tsne = TSNE(n_components=2, n_iter=3000, random_state=RANDOM_STATE)
layers = tsne.fit_transform(mnist_random_df.loc[:, mnist_random_df.columns != 'target'].values)
# Column names match what plot_mnist_2d reads.
mnist_df = pd.DataFrame(layers, columns=['1st eigenvector', '2nd eigenvector'])
mnist_df['target'] = mnist_random_df['target'].values
plot_mnist_2d(
mnist_df,
title = 'MNIST tSNE with random sampling')
# Same embedding for the kNN-sampled frame; `tsne`, `layers` and `mnist_df` are rebound.
tsne = TSNE(n_components=2, n_iter=3000, random_state=RANDOM_STATE)
layers = tsne.fit_transform(mnist_knn_df.loc[:, mnist_knn_df.columns != 'target'].values)
mnist_df = pd.DataFrame(layers, columns=['1st eigenvector', '2nd eigenvector'])
mnist_df['target'] = mnist_knn_df['target'].values
plot_mnist_2d(
mnist_df,
title = 'MNIST tSNE with kNN sampling')
%%capture
# LION t-SNE on the randomly sampled MNIST pixels, plotted per digit class.
lionTSNE_mnist = lion_tsne.LionTSNE(perplexity=30)
Y_lionTSNE_mnist = lionTSNE_mnist.fit(mnist_random_df.loc[:, mnist_random_df.columns != 'target'].values, optimizer_kwargs={'momentum': 0.8, 'n_iter': 3000,
'early_exaggeration_iters' : 300}, random_seed=1, verbose=2)
lionrn_mnist_df = pd.DataFrame(Y_lionTSNE_mnist, columns=['1st eigenvector', '2nd eigenvector'])
lionrn_mnist_df['target'] = mnist_random_df['target'].values
plot_mnist_2d(
lionrn_mnist_df,
title = 'MNIST LION tSNE with random sampling')
%%capture
# Same LION t-SNE settings on the kNN-sampled MNIST frame.
lionTSNE_mnist = lion_tsne.LionTSNE(perplexity=30)
Y_lionTSNE_knn_mnist = lionTSNE_mnist.fit(mnist_knn_df.loc[:, mnist_knn_df.columns != 'target'].values, optimizer_kwargs={'momentum': 0.8, 'n_iter': 3000,
'early_exaggeration_iters' : 300}, random_seed=1, verbose=2)
lionknn_mnist_df = pd.DataFrame(Y_lionTSNE_knn_mnist, columns=['1st eigenvector', '2nd eigenvector'])
lionknn_mnist_df['target'] = mnist_knn_df['target'].values
plot_mnist_2d(
lionknn_mnist_df,
title = 'MNIST LION tSNE with kNN sampling')
%%capture
# Reduce the randomly sampled MNIST pixels to 2 components with MDS, then run LION t-SNE on that output.
# NOTE(review): MDS is constructed without a random_state, so this step may not be reproducible — confirm.
mnist_mds = MDS(n_components=2)
X_mnist_mds = mnist_mds.fit_transform(mnist_random_df.loc[:, mnist_random_df.columns != 'target'].values)
lionTSNE_mnist = lion_tsne.LionTSNE(perplexity=30)
Y_lionTSNE_mds_rn_mnist = lionTSNE_mnist.fit(X_mnist_mds, optimizer_kwargs={'momentum': 0.8, 'n_iter': 3000,
'early_exaggeration_iters' : 300}, random_seed=1, verbose=2)
mnist_lion_mds_df = pd.DataFrame(Y_lionTSNE_mds_rn_mnist, columns=['1st eigenvector', '2nd eigenvector'])
mnist_lion_mds_df['target'] = mnist_random_df['target'].values
plot_mnist_2d(
mnist_lion_mds_df,
title = 'MNIST LION tSNE with MDS and random sampling')
%%capture
# Same MDS + LION t-SNE pipeline on the kNN-sampled MNIST frame.
mnist_mds = MDS(n_components=2)
X_mnist_mds = mnist_mds.fit_transform(mnist_knn_df.loc[:, mnist_knn_df.columns != 'target'].values)
lionTSNE_mnist = lion_tsne.LionTSNE(perplexity=30)
Y_lionTSNE_mds_knn_mnist = lionTSNE_mnist.fit(X_mnist_mds, optimizer_kwargs={'momentum': 0.8, 'n_iter': 3000,
'early_exaggeration_iters' : 300}, random_seed=1, verbose=2)
mnist_lion_knn_mds_df = pd.DataFrame(Y_lionTSNE_mds_knn_mnist, columns=['1st eigenvector', '2nd eigenvector'])
mnist_lion_knn_mds_df['target'] = mnist_knn_df['target'].values
plot_mnist_2d(
mnist_lion_knn_mds_df,
title = 'MNIST LION tSNE with MDS and knn sampling')
%%capture
# Reduce the randomly sampled MNIST pixels to 2 components with PCA, then run LION t-SNE on that output.
mnist_pca = PCA(n_components=2)
X_mnist_pca = mnist_pca.fit_transform(mnist_random_df.loc[:, mnist_random_df.columns != 'target'].values)
lionTSNE_mnist = lion_tsne.LionTSNE(perplexity=30)
Y_lionTSNE_pca_rn_mnist = lionTSNE_mnist.fit(X_mnist_pca, optimizer_kwargs={'momentum': 0.8, 'n_iter': 3000,
'early_exaggeration_iters' : 300}, random_seed=1, verbose=2)
mnist_lion_pca_df = pd.DataFrame(Y_lionTSNE_pca_rn_mnist, columns=['1st eigenvector', '2nd eigenvector'])
mnist_lion_pca_df['target'] = mnist_random_df['target'].values
plot_mnist_2d(
mnist_lion_pca_df,
title = 'MNIST LION tSNE with PCA and random sampling')
%%capture
# Same PCA + LION t-SNE pipeline on the kNN-sampled MNIST frame.
mnist_pca = PCA(n_components=2)
X_mnist_pca = mnist_pca.fit_transform(mnist_knn_df.loc[:, mnist_knn_df.columns != 'target'].values)
lionTSNE_mnist = lion_tsne.LionTSNE(perplexity=30)
Y_lionTSNE_pca_knn_mnist = lionTSNE_mnist.fit(X_mnist_pca, optimizer_kwargs={'momentum': 0.8, 'n_iter': 3000,
'early_exaggeration_iters' : 300}, random_seed=1, verbose=2)
mnist_lion_knn_pca_df = pd.DataFrame(Y_lionTSNE_pca_knn_mnist, columns=['1st eigenvector', '2nd eigenvector'])
mnist_lion_knn_pca_df['target'] = mnist_knn_df['target'].values
plot_mnist_2d(
mnist_lion_knn_pca_df,
title = 'MNIST LION tSNE with PCA and knn sampling')
# Load Fashion-MNIST from CSV. Column '2.5' is the one this file uses as the class label.
# NOTE(review): that column name looks like a data value read as a header — verify the CSV has a real header row.
df_fmnist = pd.read_csv('datasets/fashion-mnist.csv')
# Preview: first 25 rows without the label column, divided by 255 and reshaped to 28x28 images.
df_images = df_fmnist[:25]
df_images = df_images.drop(columns=['2.5'])
df_images /= 255
images = df_images.values.reshape(25, 28, 28)
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
# Show the 25 images on a 5x5 grid in grayscale.
for i in range(25):
plt.subplot(5 , 5, i+1)
plt.imshow(images[i], cmap=plt.get_cmap('gray'))
# Random sampling: draw 2000 row labels out of the first 12000 rows with a fixed seed.
# NOTE(review): np.random.choice samples with replacement by default, so rows may repeat — confirm this is intended.
fmnist_train = df_fmnist[:12000]
np.random.seed(0)
ind = np.random.choice(np.arange(12000), size = 2000)
X_fmnist_rand = fmnist_train.loc[ind]
X_fmnist_rand = X_fmnist_rand.drop(columns=['2.5'])
X_fmnist_rand /= 255
X_fmnist_rand.reset_index(drop=True, inplace=True)
y_fmnist_rand = fmnist_train.loc[ind, '2.5']
y_fmnist_rand.reset_index(drop=True, inplace=True)
# Bare expressions: only produce visible output in an interactive session.
X_fmnist_rand
y_fmnist_rand
y_fmnist_rand.value_counts()
# fmnist_random_df is the same object as X_fmnist_rand, so the 'target' column is added to both names.
fmnist_random_df = X_fmnist_rand
fmnist_random_df['target'] = y_fmnist_rand.values
fmnist_random_df
def get_fmnist_neighbors(train, test_row, num_neighbors, class_type):
    """Return the rows of ``train`` closest to ``test_row`` as a labelled frame.

    Args:
        train: Iterable of equal-length feature rows.
        test_row: Reference vector the distances are measured from.
        num_neighbors: Maximum number of nearest rows to keep.
        class_type: Value written to the ``target`` column of every row.

    Returns:
        A DataFrame with one integer-named column per feature plus
        ``target``, ordered from nearest to farthest. If ``train`` has fewer
        rows than ``num_neighbors``, all of them are returned.
    """
    # sorted() is stable, so ties keep their original order in ``train``.
    ranked = sorted(train, key=lambda row: euclidean_distance(test_row, row))
    # Build the frame in one step: DataFrame.append was removed in pandas 2.0
    # and re-copied the whole frame on every call.
    result_df = pd.DataFrame(list(ranked[:num_neighbors]))
    result_df['target'] = class_type
    return result_df
# kNN sampling: split the 12000-row training slice into features (divided
# by 255) and labels (column '2.5'), fit one centroid per class, then keep
# the 200 rows closest to each centroid.
X_fmnist = fmnist_train.drop(columns=['2.5'])
X_fmnist /= 255
y_fmnist = fmnist_train['2.5']
clf = NearestCentroid()
clf.fit(X_fmnist, y_fmnist)
# Bare expression: shows the centroids in an interactive session.
clf.centroids_
fmnist_target_values = np.unique(y_fmnist)
# Class values are cast to int and used as row indices into clf.centroids_.
# The per-class frames are combined with pd.concat because
# DataFrame.append was removed in pandas 2.0.
fmnist_knn_df = pd.concat(
    [get_fmnist_neighbors(X_fmnist.values, clf.centroids_[int(i)], 200, str(i))
     for i in fmnist_target_values],
    ignore_index=True)
fmnist_knn_df['target'].value_counts()
def plot_fmnist_2d(df, title, xlabel="1st eigenvector", ylabel="2nd eigenvector"):
    """Scatter a 2-D embedding on the current figure, one series per class.

    Args:
        df: DataFrame with ``'1st eigenvector'``, ``'2nd eigenvector'`` and
            ``'target'`` columns. ``target`` may hold integers or their
            string form.
        title: Text shown above the plot.
        xlabel: Label for the x axis.
        ylabel: Label for the y axis.

    The class list comes from the module-level ``data_digits.target_names``.
    """
    plt.gcf().set_size_inches(20, 15)
    sns.set_style("darkgrid")
    # Compare labels as strings. The random-sample frame takes its targets
    # straight from the CSV while the kNN frame stores str(label); matching
    # on the string form lets both plot instead of one coming out empty.
    # NOTE(review): assumes the CSV labels are integers — confirm.
    targets = df['target'].astype(str)
    legend_list = []
    for d in data_digits.target_names:
        label = str(d)
        points = df[targets == label]
        plt.scatter(points['1st eigenvector'], points['2nd eigenvector'])
        legend_list.append(label)
    plt.title(title, fontsize=20, y=1.03)
    plt.legend(legend_list)
    plt.xlabel(xlabel, fontsize=16)
    plt.ylabel(ylabel, fontsize=16)
# scikit-learn TSNE on the randomly sampled Fashion-MNIST pixels (every column except 'target').
tsne = TSNE(n_components=2, n_iter=3000, random_state=RANDOM_STATE)
layers = tsne.fit_transform(fmnist_random_df.loc[:, fmnist_random_df.columns != 'target'].values)
# Column names match what plot_fmnist_2d reads.
fmnist_df = pd.DataFrame(layers, columns=['1st eigenvector', '2nd eigenvector'])
fmnist_df['target'] = fmnist_random_df['target'].values
plot_fmnist_2d(
fmnist_df,
title = 'FMNIST tSNE with random sampling')
tSNE with knn sampling
# scikit-learn TSNE on the kNN-sampled Fashion-MNIST frame; `tsne`, `layers` and `fmnist_df` are rebound.
tsne = TSNE(n_components=2, n_iter=3000, random_state=RANDOM_STATE)
layers = tsne.fit_transform(fmnist_knn_df.loc[:, fmnist_knn_df.columns != 'target'].values)
fmnist_df = pd.DataFrame(layers, columns=['1st eigenvector', '2nd eigenvector'])
fmnist_df['target'] = fmnist_knn_df['target'].values
plot_fmnist_2d(
fmnist_df,
title = 'FMNIST tSNE with knn sampling')
%%capture
# LION t-SNE on the randomly sampled Fashion-MNIST pixels, plotted per class.
lionTSNE_fmnist = lion_tsne.LionTSNE(perplexity=30)
Y_lionTSNE_fmnist = lionTSNE_fmnist.fit(fmnist_random_df.loc[:, fmnist_random_df.columns != 'target'].values, optimizer_kwargs={'momentum': 0.8, 'n_iter': 3000,
'early_exaggeration_iters' : 300}, random_seed=1, verbose=2)
lionrn_fmnist_df = pd.DataFrame(Y_lionTSNE_fmnist, columns=['1st eigenvector', '2nd eigenvector'])
lionrn_fmnist_df['target'] = fmnist_random_df['target'].values
plot_fmnist_2d(
lionrn_fmnist_df,
title = 'FMNIST LION tSNE with random sampling')
%%capture
# Same LION t-SNE settings on the kNN-sampled frame; Y_lionTSNE_fmnist is overwritten.
lionTSNE_fmnist = lion_tsne.LionTSNE(perplexity=30)
Y_lionTSNE_fmnist = lionTSNE_fmnist.fit(fmnist_knn_df.loc[:, fmnist_knn_df.columns != 'target'].values, optimizer_kwargs={'momentum': 0.8, 'n_iter': 3000,
'early_exaggeration_iters' : 300}, random_seed=1, verbose=2)
lionknn_fmnist_df = pd.DataFrame(Y_lionTSNE_fmnist, columns=['1st eigenvector', '2nd eigenvector'])
lionknn_fmnist_df['target'] = fmnist_knn_df['target'].values
plot_fmnist_2d(
lionknn_fmnist_df,
title = 'FMNIST LION tSNE with knn sampling')
%%capture
# Reduce the randomly sampled Fashion-MNIST pixels to 2 components with MDS, then run LION t-SNE on that output.
# NOTE(review): MDS is constructed without a random_state, so this step may not be reproducible — confirm.
fmnist_mds = MDS(n_components=2)
X_fmnist_mds = fmnist_mds.fit_transform(fmnist_random_df.loc[:, fmnist_random_df.columns != 'target'].values)
lionTSNE_fmnist = lion_tsne.LionTSNE(perplexity=30)
Y_lionTSNE_fmnist = lionTSNE_fmnist.fit(X_fmnist_mds, optimizer_kwargs={'momentum': 0.8, 'n_iter': 3000,
'early_exaggeration_iters' : 300}, random_seed=1, verbose=2)
lionrn_mds_fmnist_df = pd.DataFrame(Y_lionTSNE_fmnist, columns=['1st eigenvector', '2nd eigenvector'])
lionrn_mds_fmnist_df['target'] = fmnist_random_df['target'].values
plot_fmnist_2d(
lionrn_mds_fmnist_df,
title = 'FMNIST LION tSNE with MDS and random sampling')
%%capture
# Same MDS + LION t-SNE pipeline on the kNN-sampled Fashion-MNIST frame.
fmnist_mds = MDS(n_components=2)
X_fmnist_mds = fmnist_mds.fit_transform(fmnist_knn_df.loc[:, fmnist_knn_df.columns != 'target'].values)
lionTSNE_fmnist = lion_tsne.LionTSNE(perplexity=30)
Y_lionTSNE_fmnist = lionTSNE_fmnist.fit(X_fmnist_mds, optimizer_kwargs={'momentum': 0.8, 'n_iter': 3000,
'early_exaggeration_iters' : 300}, random_seed=1, verbose=2)
lionknn_mds_fmnist_df = pd.DataFrame(Y_lionTSNE_fmnist, columns=['1st eigenvector', '2nd eigenvector'])
lionknn_mds_fmnist_df['target'] = fmnist_knn_df['target'].values
plot_fmnist_2d(
lionknn_mds_fmnist_df,
title = 'FMNIST LION tSNE with MDS and knn sampling')
%%capture
# Reduce the randomly sampled Fashion-MNIST pixels to 2 components with PCA, then run LION t-SNE on that output.
fmnist_pca = PCA(n_components=2)
X_fmnist_pca = fmnist_pca.fit_transform(fmnist_random_df.loc[:, fmnist_random_df.columns != 'target'].values)
lionTSNE_fmnist = lion_tsne.LionTSNE(perplexity=30)
Y_lionTSNE_pca_rn_fmnist = lionTSNE_fmnist.fit(X_fmnist_pca, optimizer_kwargs={'momentum': 0.8, 'n_iter': 3000,
'early_exaggeration_iters' : 300}, random_seed=1, verbose=2)
lionrn_pca_fmnist_df = pd.DataFrame(Y_lionTSNE_pca_rn_fmnist, columns=['1st eigenvector', '2nd eigenvector'])
lionrn_pca_fmnist_df['target'] = fmnist_random_df['target'].values
plot_fmnist_2d(
lionrn_pca_fmnist_df,
title = 'FMNIST LION tSNE with PCA and random sampling')
%%capture
# Same PCA + LION t-SNE pipeline on the kNN-sampled Fashion-MNIST frame.
fmnist_pca = PCA(n_components=2)
X_fmnist_pca = fmnist_pca.fit_transform(fmnist_knn_df.loc[:, fmnist_knn_df.columns != 'target'].values)
lionTSNE_fmnist = lion_tsne.LionTSNE(perplexity=30)
Y_lionTSNE_pca_knn_fmnist = lionTSNE_fmnist.fit(X_fmnist_pca, optimizer_kwargs={'momentum': 0.8, 'n_iter': 3000,
'early_exaggeration_iters' : 300}, random_seed=1, verbose=2)
lionknn_pca_fmnist_df = pd.DataFrame(Y_lionTSNE_pca_knn_fmnist, columns=['1st eigenvector', '2nd eigenvector'])
lionknn_pca_fmnist_df['target'] = fmnist_knn_df['target'].values
plot_fmnist_2d(
lionknn_pca_fmnist_df,
title = 'FMNIST LION tSNE with PCA and knn sampling')
# Load the Reuters (RCV) CSV and keep the first 2000 rows as the working set.
df_rcv = pd.read_csv('datasets/rcv.csv')
rcv_train = df_rcv[:2000]
# Column "3" is renamed to 'target' and used as the class label by the cells that follow.
rcv_train = rcv_train.rename(columns={"3": "target",})
def plot_2d(df, title, labels, xlabel="1st eigenvector", ylabel="2nd eigenvector"):
    """Scatter a 2-D embedding on the current figure, one series per label.

    Args:
        df: DataFrame with ``'1st eigenvector'``, ``'2nd eigenvector'`` and
            ``'target'`` columns.
        title: Text shown above the plot.
        labels: Iterable of label values; each is compared to ``target``
            with ``==`` and shown in the legend as ``str(label)``.
        xlabel: Label for the x axis.
        ylabel: Label for the y axis.
    """
    figure = plt.gcf()
    figure.set_size_inches(20, 15)
    sns.set_style("darkgrid")
    legend_entries = []
    for d in labels:
        subset = df[df['target'] == d]
        plt.scatter(subset['1st eigenvector'], subset['2nd eigenvector'])
        legend_entries.append(str(d))
    plt.title(title, fontsize=20, y=1.03)
    plt.legend(legend_entries)
    # Both axis labels share the same font size.
    for set_label, text in ((plt.xlabel, xlabel), (plt.ylabel, ylabel)):
        set_label(text, fontsize=16)
# scikit-learn TSNE on the Reuters working set (every column except 'target'), plotted per label.
tsne = TSNE(n_components=2, n_iter=3000, random_state=RANDOM_STATE)
layers = tsne.fit_transform(rcv_train.loc[:, rcv_train.columns != 'target'].values)
tsne_rcv_df = pd.DataFrame(layers, columns=['1st eigenvector', '2nd eigenvector'])
tsne_rcv_df['target'] = rcv_train['target'].values
plot_2d(
tsne_rcv_df,
title = 'Reuters tSNE',
labels=np.unique(tsne_rcv_df['target']))
%%capture
# LION t-SNE on the Reuters working set.
lionTSNE_rcv = lion_tsne.LionTSNE(perplexity=30)
Y_lionTSNE_rcv = lionTSNE_rcv.fit(rcv_train.loc[:, rcv_train.columns != 'target'].values, optimizer_kwargs={'momentum': 0.8, 'n_iter': 3000,
'early_exaggeration_iters' : 300}, random_seed=1, verbose=2)
lion_rcv_df = pd.DataFrame(Y_lionTSNE_rcv, columns=['1st eigenvector', '2nd eigenvector'])
lion_rcv_df['target'] = rcv_train['target'].values
# The label list is taken from tsne_rcv_df, whose targets come from the same rcv_train column.
plot_2d(
lion_rcv_df,
title = 'Reuters LION tSNE',
labels=np.unique(tsne_rcv_df['target']))
%%capture
# Reduce the Reuters working set to 2 components with MDS.
# NOTE(review): MDS is constructed without a random_state, so this step may not be reproducible — confirm.
rcv_mds = MDS(n_components=2)
X_rcv_mds = rcv_mds.fit_transform(rcv_train.loc[:, rcv_train.columns != 'target'].values)
%%capture
# LION t-SNE on the MDS output, plotted per label.
lionTSNE_rcv = lion_tsne.LionTSNE(perplexity=30)
Y_lionTSNE_mds_rcv = lionTSNE_rcv.fit(X_rcv_mds, optimizer_kwargs={'momentum': 0.8, 'n_iter': 3000,
'early_exaggeration_iters' : 300}, random_seed=1, verbose=2)
lion_mds_rcv_df = pd.DataFrame(Y_lionTSNE_mds_rcv, columns=['1st eigenvector', '2nd eigenvector'])
lion_mds_rcv_df['target'] = rcv_train['target'].values
plot_2d(
lion_mds_rcv_df,
title = 'Reuters LION tSNE with MDS',
labels=np.unique(tsne_rcv_df['target']))
%%capture
# Reduce the Reuters working set to 2 components with PCA.
rcv_pca = PCA(n_components=2)
X_rcv_pca = rcv_pca.fit_transform(rcv_train.loc[:, rcv_train.columns != 'target'].values)
%%capture
# LION t-SNE on the PCA output, plotted per label.
lionTSNE_rcv = lion_tsne.LionTSNE(perplexity=30)
Y_lionTSNE_pca_rcv = lionTSNE_rcv.fit(X_rcv_pca, optimizer_kwargs={'momentum': 0.8, 'n_iter': 3000,
'early_exaggeration_iters' : 300}, random_seed=1, verbose=2)
lion_pca_rcv_df = pd.DataFrame(Y_lionTSNE_pca_rcv, columns=['1st eigenvector', '2nd eigenvector'])
lion_pca_rcv_df['target'] = rcv_train['target'].values
plot_2d(
lion_pca_rcv_df,
title = 'Reuters LION tSNE with PCA',
labels=np.unique(tsne_rcv_df['target']))
Let's sample from the Reuters dataset only those article types that occur more than 100 times, then find the 300 nearest neighbours of each class centroid.
# Take the first 13000 Reuters rows and keep only classes that occur more
# than 100 times.
knn_train = df_rcv[:13000]
knn_train = knn_train.rename(columns={"3": "target"})
# Filter the counts Series directly. The former
# reset_index(name="count")[...]["index"] lookup depends on how
# value_counts names its index, which changed in pandas 2.0.
target_counts = knn_train.target.value_counts()
vc = target_counts[target_counts > 100].index.values
knn_train = knn_train[knn_train.target.isin(vc)]
# Bare expressions: only produce visible output in an interactive session.
knn_train.target.value_counts()
rcv_classes = np.unique(knn_train.target)
rcv_classes
def get_rcv_neighbors(train, test_row, num_neighbors, class_type):
    """Return the rows of ``train`` closest to ``test_row`` as a labelled frame.

    Args:
        train: Iterable of equal-length feature rows.
        test_row: Reference vector the distances are measured from.
        num_neighbors: Maximum number of nearest rows to keep.
        class_type: Value written to the ``target`` column of every row.

    Returns:
        A DataFrame with one integer-named column per feature plus
        ``target``, ordered from nearest to farthest. If ``train`` has fewer
        rows than ``num_neighbors``, all of them are returned.
    """
    # sorted() is stable, so ties keep their original order in ``train``.
    ranked = sorted(train, key=lambda row: euclidean_distance(test_row, row))
    # Build the frame in one step: DataFrame.append was removed in pandas 2.0
    # and re-copied the whole frame on every call.
    result_df = pd.DataFrame(list(ranked[:num_neighbors]))
    result_df['target'] = class_type
    return result_df
# kNN sampling: fit one centroid per remaining Reuters class, then keep the
# 300 rows closest to each centroid.
X_knn_train = knn_train.drop(columns=['target'])
y_knn_train = knn_train['target']
clf = NearestCentroid()
clf.fit(X_knn_train, y_knn_train)
# Bare expression: shows the centroids in an interactive session.
clf.centroids_
# Pair each centroid with its class label from the fitted classifier, so
# the class count is no longer hard-coded as 7. The per-class frames are
# combined with pd.concat because DataFrame.append was removed in pandas 2.0.
rcv_knn_df = pd.concat(
    [get_rcv_neighbors(X_knn_train.values, centroid, 300, str(label))
     for label, centroid in zip(clf.classes_, clf.centroids_)],
    ignore_index=True)
rcv_knn_df['target'].value_counts()
# scikit-learn TSNE on the kNN-sampled Reuters frame (every column except 'target'), plotted per label.
tsne = TSNE(n_components=2, n_iter=3000, random_state=RANDOM_STATE)
layers = tsne.fit_transform(rcv_knn_df.loc[:, rcv_knn_df.columns != 'target'].values)
tsne_knn_rcv_df = pd.DataFrame(layers, columns=['1st eigenvector', '2nd eigenvector'])
tsne_knn_rcv_df['target'] = rcv_knn_df['target'].values
plot_2d(
tsne_knn_rcv_df,
title = 'Reuters tSNE knn',
labels=np.unique(rcv_knn_df['target']))
%%capture
# LION t-SNE on the kNN-sampled Reuters frame; Y_lionTSNE_rcv is overwritten.
lionTSNE_rcv = lion_tsne.LionTSNE(perplexity=30)
Y_lionTSNE_rcv = lionTSNE_rcv.fit(rcv_knn_df.loc[:, rcv_knn_df.columns != 'target'].values, optimizer_kwargs={'momentum': 0.8, 'n_iter': 3000,
'early_exaggeration_iters' : 300}, random_seed=1, verbose=2)
liontsne_knn_rcv_df = pd.DataFrame(Y_lionTSNE_rcv, columns=['1st eigenvector', '2nd eigenvector'])
liontsne_knn_rcv_df['target'] = rcv_knn_df['target'].values
plot_2d(
liontsne_knn_rcv_df,
title = 'Reuters LION tSNE knn',
labels=np.unique(rcv_knn_df['target']))
%%capture
# Reduce the kNN-sampled Reuters frame to 2 components with MDS.
# NOTE(review): MDS is constructed without a random_state, so this step may not be reproducible — confirm.
rcv_mds = MDS(n_components=2)
X_rcv_knn_mds = rcv_mds.fit_transform(rcv_knn_df.loc[:, rcv_knn_df.columns != 'target'].values)
%%capture
# LION t-SNE on the MDS output, plotted per label; Y_lionTSNE_mds_rcv is overwritten.
lionTSNE_rcv = lion_tsne.LionTSNE(perplexity=30)
Y_lionTSNE_mds_rcv = lionTSNE_rcv.fit(X_rcv_knn_mds, optimizer_kwargs={'momentum': 0.8, 'n_iter': 3000,
'early_exaggeration_iters' : 300}, random_seed=1, verbose=2)
liontsne_knn_mds_rcv_df = pd.DataFrame(Y_lionTSNE_mds_rcv, columns=['1st eigenvector', '2nd eigenvector'])
liontsne_knn_mds_rcv_df['target'] = rcv_knn_df['target'].values
plot_2d(
liontsne_knn_mds_rcv_df,
title = 'Reuters LION tSNE with MDS knn',
labels=np.unique(rcv_knn_df['target']))
Reuters LION tSNE with PCA and knn
%%capture
# Reduce the kNN-sampled Reuters frame to 2 components with PCA; X_rcv_pca is overwritten.
rcv_pca = PCA(n_components=2)
X_rcv_pca = rcv_pca.fit_transform(rcv_knn_df.loc[:, rcv_knn_df.columns != 'target'].values)
%%capture
# LION t-SNE on the PCA output, plotted per label; Y_lionTSNE_pca_rcv is overwritten.
lionTSNE_rcv = lion_tsne.LionTSNE(perplexity=30)
Y_lionTSNE_pca_rcv = lionTSNE_rcv.fit(X_rcv_pca, optimizer_kwargs={'momentum': 0.8, 'n_iter': 3000,
'early_exaggeration_iters' : 300}, random_seed=1, verbose=2)
liontsne_knn_pca_rcv_df = pd.DataFrame(Y_lionTSNE_pca_rcv, columns=['1st eigenvector', '2nd eigenvector'])
liontsne_knn_pca_rcv_df['target'] = rcv_knn_df['target'].values
plot_2d(
liontsne_knn_pca_rcv_df,
title = 'Reuters LION tSNE with PCA knn',
labels=np.unique(rcv_knn_df['target']))